# Tomz Weeks Word Embeddings Reanalysis

# ==============================================================================
# DESCRIPTION:
#   This script reanalyzes word embeddings to measure semantic associations between
#   race/ethnicity categories and democracy-related concepts, creating Figure 2.
# ==============================================================================


# Session setup.
# The workspace is deliberately NOT cleared with rm(list = ls()): that call
# only deletes global objects (loaded packages and options persist, so it
# does not produce a clean session) and it destroys the workspace of anyone
# who source()s this file. Every object this script reads is assigned by the
# script itself before use. For a truly clean run, start a fresh R session.

# Load necessary libraries
library(lsa)      # cosine()
library(dplyr)    # filter(), %>%
library(ggplot2)  # plotting and ggsave()

# Fix the RNG seed for reproducibility
# NOTE(review): no step visible in this script draws random numbers -- confirm
# whether the seed is still needed.
set.seed(1912)

# Read the pre-trained GloVe embeddings from disk (RPP used the 200d vectors).
# The code further down indexes this object by word, i.e. by row name.
glove_vectors <- readRDS("glove_200d_vectors.rds")

# Democracy vocabulary (taken from RPP)
democ_words <- c(
  "democracy", "democratic", "democratically",
  "elect", "elections", "elected"
)

# Race/ethnicity vocabularies that get compared against democ_words
# (see Supplemental Material for details)
word_lists <- list(
  # --- RPP's original word lists ---
  white_rpp = c("white", "western", "caucasian", "european"),
  nonwhite_rpp = c(
    "non-white", "non-western", "non-caucasian", "non-european"
  ),

  # --- US Federal categories ---
  white_us = c(
    "white", "english", "german", "irish", "italian", "polish", "scottish"
  ),
  asian_us = c(
    "asian", "chinese", "indian", "filipino", "vietnamese", "korean",
    "japanese"
  ),
  black_us = c(
    "black", "african-american", "jamaican", "haitian", "nigerian",
    "ethiopian", "somali"
  ),
  hispanic_us = c(
    "hispanic", "latino", "mexican", "puerto-rican", "salvadoran", "cuban",
    "dominican", "guatemalan"
  ),
  mena_us = c(
    "middle-eastern", "north-african", "lebanese", "iranian", "egyptian",
    "syrian", "iraqi", "israeli"
  ),
  pacific_us = c(
    "hawaiian", "samoan", "chamorro", "tongan", "fijian", "marshallese"
  ),
  americanindian_us = c(
    "alaskan", "navajo", "blackfeet", "inupiat", "eskimo", "aztec", "mayan"
  )
)

# Aggregate non-white category: pool the words of every list except the two
# RPP lists and the US white list, keeping each word once
nonwhite_all <- unique(unlist(
  word_lists[
    setdiff(names(word_lists), c("white_rpp", "nonwhite_rpp", "white_us"))
  ]
))

# Average the embedding rows for a set of words.
#
# Args:
#   words:      character vector of vocabulary words.
#   embeddings: object indexed by word along its rows; defaults to the
#               glove_vectors object loaded earlier in this script.
#               (Assumes the row names are the vocabulary words, which is
#               what the character row indexing relies on.)
# Returns: the column means of the selected rows.
# Errors:  stops with the list of offending words if any are missing from the
#          vocabulary, instead of a bare "subscript out of bounds" (matrix)
#          or silently averaging NA rows (data frame).
mean_embedding <- function(words, embeddings = glove_vectors) {
  missing_words <- setdiff(words, rownames(embeddings))
  if (length(missing_words) > 0) {
    stop(
      "Words not found in the embedding vocabulary: ",
      paste(missing_words, collapse = ", "),
      call. = FALSE
    )
  }
  # drop = FALSE keeps a one-word selection two-dimensional so colMeans()
  # still works on it
  colMeans(embeddings[words, , drop = FALSE])
}

# Calculate the mean vector for democ_words
democ_mean_vector <- mean_embedding(democ_words)

# Cosine similarity between a word list's mean vector and the democracy mean
# vector, returned as a plain length-one numeric
democ_similarity <- function(words) {
  as.numeric(
    cosine(
      as.numeric(mean_embedding(words)),
      as.numeric(democ_mean_vector)
    )
  )
}

# Score every word list, followed by the aggregate nonwhite_all list, in one
# pass (no data frame is grown inside a loop). Row order matches the original:
# the lists in word_lists order, then nonwhite_all last.
all_lists <- c(word_lists, list(nonwhite_all = nonwhite_all))

results <- data.frame(
  ListName = names(all_lists),
  CosineSimilarity = vapply(
    all_lists, democ_similarity, numeric(1), USE.NAMES = FALSE
  ),
  stringsAsFactors = FALSE
)

# Display names for the US categories, keyed by word-list name
labels <- c(
  white_us          = "White",
  asian_us          = "Asian",
  black_us          = "Black or African American",
  hispanic_us       = "Hispanic or Latino",
  mena_us           = "Middle Eastern or North African",
  pacific_us        = "Native Hawaiian or Pacific Islander",
  americanindian_us = "American Indian",
  nonwhite_all      = "Average of all non-white words"
)

# Show the complete similarity table (RPP lists included) in the console
print(results)

# Keep only the US-based rows for the figure; the two RPP lists are dropped
us_results <- filter(
  results,
  !(ListName %in% c("white_rpp", "nonwhite_rpp"))
)

# Attach the display name for each row by looking up its list name
us_results[["Label"]] <- labels[us_results[["ListName"]]]

# Figure 2: lollipop chart of cosine similarity by US category.

# Order the categories by similarity once, on a plotting copy, so that every
# layer shares the same y-axis ordering and us_results itself is left as is
plot_data <- us_results
plot_data$Label <- reorder(plot_data$Label, plot_data$CosineSimilarity)

# Minimal theme with the y-axis title, ticks and horizontal grid lines removed
lollipop_theme <- theme_minimal() +
  theme(
    axis.text.y = element_text(size = 10, angle = 0, hjust = 1),
    axis.title.y = element_blank(),
    axis.ticks.y = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    plot.title = element_text(hjust = 0.5)
  )

# Layers are added in the same order as before: points, then the stems running
# back to x = 0, then the rounded value printed just right of each point.
# No plot title is set (the title was already disabled in the original).
p_us <- ggplot(plot_data, aes(x = CosineSimilarity, y = Label)) +
  geom_point(size = 2) +
  geom_segment(
    aes(xend = 0, yend = Label),
    linewidth = 1,
    alpha = 0.8
  ) +
  geom_text(
    aes(label = round(CosineSimilarity, 2), x = CosineSimilarity + 0.01),
    size = 3,
    hjust = 0
  ) +
  labs(x = "Cosine Similarity") +
  lollipop_theme

# Write the figure to PDF
ggsave(
  "Figure2-WordEmbeddingsReanalysis.pdf",
  plot = p_us,
  width = 8.5,
  height = 4,
  units = "in"
)

# Render the figure on the active graphics device
print(p_us)
